# DLI regression analysis
# Sarah Bratt
#Oct 2 2024

library(dplyr)

setwd( "C:/Users/sebratt/DropBox")
final_df <- read.csv("final_df_DLI_regression.csv", sep=",", header =TRUE)
dim(final_df)

uniq_df <- final_df[!duplicated(final_df$ssid),]
rm(final_df)
####### DATA CLEANING ####################

# Check the year distribution 
tab <- as.data.frame(table(uniq_df$year))
hist(uniq_df$year)

# There are years 1955-2021.Weird. 
# What are the early years?

# How to handle missing data? How many have citation information? 
t <- as.data.frame(table(uniq_df$citationcount))
dim(uniq_df)[1] - dim(cite_df)[1] # There are 92677 ssids without citation information. 
summary(cite_df$year)

cite_df2 <- uniq_df[which(uniq_df$citationcount != ""),]
cite_df3 <- cite_df2[which(cite_df2$journalimpactfactor != "-1"),] # there are 6616 obj with -1 for JIF
cite_df3 <- cite_df3[which(cite_df3$year >= 1992),]


#cite_df4 <- cite_df3[grep("Biology,Medicine|Biology|Medicine", cite_df3$field_of_study_2), ] 
biology_medicine <- cite_df3[which(cite_df3$field_of_study_2 =="Biology,Medicine"),]
biology <- cite_df3[which(cite_df3$field_of_study_2 =="Biology"),]
medicine <- cite_df3[which(cite_df3$field_of_study_2 =="Medicine"),]

cite_df4 <- rbind(biology,biology_medicine, medicine)

cite_df4$X <- NULL

write.csv2(cite_df4, "DLI_regression.csv")

rm(uniq_df)
rm(cite_df2, cite_df3, biology_medicine, medicine, biology)
gc()

fields <- as.data.frame(table(cite_df3$field_of_study_2)) 
#(184938+16442+6882+5366)/216149
#Biology, Medicine (184938) 
#Biology (16442)
#Medicine (6882)
#Chemistry, Medicine (5366)

# LIC scientists capacity building (a) new collaborators; and (b) lead-authored pubs 
LIC_df <- uniq_df[ grep("Low income", uniq_df$income_class_combined_pub) , ] 
Low_df <- uniq_df[ grep("Low", uniq_df$income_class_combined_pub) , ] 

(dim(LIC_df)[1] / dim(uniq_df)[1])*100 # 1.15 % of total ssids are LIC 
(dim(Low_df)[1] /dim(uniq_df)[1])*100 # 7.04 % of the total ssids are LMIC OR LIC 


######### REGRESSION #####################
#install.packages("faraway")
#install.packages("car")
library(faraway)
library(car)

data <-cite_df4
#data <- read.csv2("C:/Users/sebratt/DropBox/DLI_cite_regression.csv")


#Inspect and clean
str(data)
colnames(data)
data$X <- NULL
data$ssid <- NULL
data$DLI <- as.numeric(data$DLI)
data$teamsize <- (data$count_publications + data$count_datasets)
data$count_publications <- NULL
data$count_datasets <- NULL
#data$author_id <- NULL
data$citationcount <- as.numeric(data$citationcount)
data$field_of_study_2 <- as.factor(data$field_of_study_2)
data$year <- as.numeric(data$year) 
data$sd_career_age <- as.numeric(data$sd_career_age) 
data$mean_career_age <- as.numeric(data$mean_career_age)
data$type <- NULL

getwd()
write.csv2(data, "C:/Users/sebratt/DropBox/dli_data_regress_clean.csv")

#Transformations: Create dummy HIC and SAC columns 
data$SAC <- ifelse(grepl("SAC", data$sci_capacity_combined_pub), "1", "0")
table(data$SAC)[1]/table(data$SAC)[2] # 25.3% are not SAC
data$HIC <- ifelse(grepl("High income", data$income_class_combined_pub), "1", "0")
table(data$HIC)[1]/table(data$HIC)[2] # 20.7% are not SAC

# # Perform diagnostics and transformations; check normality of citation count and DLI
#Normality of citation count. Add 10 to make 0s positive and 
summary(data$citationcount) # holy moses 16,589 citations max 
hist(data$citationcount) # quite right skewed 

# wow there are some outliars
df <- data[which(data$citationcount <= 10000),]
df$cite_count10 <- df$citationcount+10

# Every tiem you transform the var, you have to explain it back. 
# E.g., The IDVs needed to be log and units of change. 
# Assumptions of normality only depends for the DV (citation counts) 
df$citation_log <- log10(df$cite_count10)
hist(df$citation_log)
summary(df$citation_log)


qqnorm(df$citationcount, main='Normal')
qqline(normal_data)

samp_df<-sample_n(df, 5000)

shapiro.test(as.numeric(samp_df$DLI_log))
# p-value < 2.2e-16

# DLI
hist(df$DLI)
df$DLI_10 <- df$DLI+10
df$DLI_log <- log10(df$DLI_10)

attach(df)
plot(citationcount ~ DLI)


options(scipen=999)
# Model 
attach(df)
model <- lm(citation_log ~ DLI_log+year+journalimpactfactor+teamsize+SAC+field_of_study_2+sd_career_age+mean_career_age, data=df)


summary(model)
model$coefficients
model$effects
plot(model$residuals)

# multi-collinearity test
#library(car)
vif(model)

#conf.ints
confint(model, level = 0.95)


plot(jitter(DLI),jitter(citationcount), col='steelblue')
plot(jitter(DLI_log),jitter(citation_log), col='dodgerblue')
par(mfrow =c(1,2))

samp_df<-sample_n(df, 5000)
plot(jitter(samp_df$DLI),jitter(samp_df$citationcount), col='steelblue')

write.csv2(df, "regression_DLI_df_with_log.csv")
rm(samp)
rm(fields,cite_df2, cite_df3, biology, medicine, biology_medicine,data)
gc()












